rm(list=ls())

library(readr)  # for read_csv()
library(rnaturalearth)
library(ggplot2)
library(tidyr)
library(ggthemes)
#library(stringr)
library(tidyverse)
library(patchwork)
library(readxl)

year_range <- 2000:2018
dir_fao <- '/Users/meijian/Work/SIMPLE20181102/Observation/FAO_Africa/yield/'
dir_fao_area <- '/Users/meijian/Work/SIMPLE20181102/Observation/FAO_Africa/area/'
filename <- '/Users/meijian/Work/Columbia_postdoc/VACS/fao_sim_africa_compare.csv'
workDir <- "/Users/meijian/Work/SIMPLE20181102/"

df_fao_sim <- as.data.frame(read_csv(filename, show_col_types = FALSE))
column_names <- names(df_fao_sim)

data_source <- c('FAO', 'SIM')

species_list <- c("africaneggplant", "bambaragroundnut", "cassava", "cocoyam", "cowpea", "fingermillet", "fonio", 
                  "grasspea", "groundnut", "josephscoat", "lablab", "maize", "mungbean", "okra", "pearlmillet", "pigeonpea", 
                  "plantain", "pumpkin", "sesame", "sorghum", "soybean", "sweetpotato", "taro", "tef", "tomato", "yams")

crop_name_link <- list("Bambara groundnut" = "bambaragroundnut", "Cassava" = "cassava", "Cowpea" = "cowpea", "Fonio" = "fonio", 
                  "Groundnut" = "groundnut", "Maize" = "maize", "Okra" = "okra", "Pearl millet" = "pearlmillet", "Pigeon pea" = "pigeonpea", "Sesame" = "sesame", 
                  "Sorghum" = "sorghum", "Soybean" = "soybean", "Sweet potato" = "sweetpotato", "Taro" = "taro", "Tomato" = "tomato", "Yams" = "yams")

speciesNum <- length(species_list)

# Get world map data
africa_map <- ne_countries(continent = "Africa", returnclass = "sf")
africa_map$sovereignt[africa_map$sovereignt == "eSwatini"] <- "Eswatini"
africa_map$sovereignt[africa_map$sovereignt == "Ivory Coast"] <- "Côte d'Ivoire"

# grid info
grid_list <- paste0(workDir, "Input/Map/SIMPLE-All_Africa_Countries_List.xlsx")
grid_info <- read_excel(grid_list)
grid_info$Country[grid_info$Country == "Guinea Bissau"] <- "Guinea-Bissau"
grid_info$Country[grid_info$Country == "Swaziland"] <- "Eswatini"
grid_info$Country[grid_info$Country == "Ivory Coast"] <- "Côte d'Ivoire"
grid_info$Country[grid_info$Country == "S. Sudan"] <- "South Sudan"
country_list <- unique(grid_info$Country)
country_list <- country_list[!country_list %in% c("Egypt", "Djibouti")] # remove Egypt and Djibouti due to 100% irrigation

filterByCROPGRIDS <- function(inPath, species, country_data) {
  library(ncdf4)
  library(raster)
  
  filtered_data <- data.frame()
  area_threshold <- 5
  mask_raster <- raster(xmn=min(country_data$long)-0.25, xmx=max(country_data$long)+0.25,
                        ymn=min(country_data$lat)-0.25, ymx=max(country_data$lat)+0.25,
                        res= 0.5, crs="+proj=longlat +datum=WGS84")
  
  HarvestAreaFile <- paste0(inPath, "Input/Map/CROPGRIDSv1.05_NC_maps/", "CROPGRIDSv1.05_", species, ".nc")
  
  if (file.exists(HarvestAreaFile)){
    nc_data <- nc_open(HarvestAreaFile)
    lon <- ncvar_get(nc_data, "lon")
    lat <- ncvar_get(nc_data, "lat")
    ha <- ncvar_get(nc_data, "harvarea")
    nc_close(nc_data)
    
    rownames(ha) <- lon
    colnames(ha) <- lat
    
    ha_df <- as.data.frame.table(ha)
    colnames(ha_df) <- c('lon','lat','ha')
    ha_df$lon <- as.numeric(as.character(ha_df$lon))
    ha_df$lat <- as.numeric(as.character(ha_df$lat))
    # convert Ocean = -2, Not assessed = -1 to 0
    ha_df$ha[ha_df$ha < 0] <- 0
    
    mask_ha <- rasterize(ha_df[,c('lon','lat')], mask_raster, ha_df[,'ha'], fun=sum, na.rm=T)
    filterTable_data <- data.frame(cbind(xyFromCell(mask_ha, 1:ncell(mask_ha)), round(values(mask_ha),2)))
    colnames(filterTable_data) <- c('long','lat','Area_sum')
    
    filter_data <- subset(filterTable_data, Area_sum >= area_threshold)
    
    filtered_data <- merge(country_data, filter_data,  by = c("long", "lat"), all.x = FALSE, sort=FALSE)
  }
  else {stop(paste0("AutoGenGridInFile.R: Can't find CROPGRIDS data for ",species,"!"))}
  
  return(filtered_data)
}

harvestArea <- list()
gcm <- "ERA5" #baseline
out_summary <- data.frame(crop = as.character(NA), country = as.character(NA), gcm = as.character(NA), year = as.integer(NA), 
                          yield = as.numeric(NA), biomass = as.numeric(NA), duration = as.numeric(NA), harvest_area = as.numeric(NA)) 
out_summary_prod <- data.frame(crop = as.character(NA), country = as.character(NA), gcm = as.character(NA), year = as.integer(NA), 
                               production = as.numeric(NA), biomass = as.numeric(NA), duration = as.numeric(NA)) 

for (species in species_list) {
  harvestArea <- filterByCROPGRIDS(workDir, species, grid_info)

  pattern <- paste0(species, "_baseline.*\\.csv$")
  files <- list.files(path = paste0(workDir, "Simple_output_discover/26crops"), pattern = pattern, full.names = TRUE)
  sim_results <- list()
  for (file in files) {
    sim_results <- c(sim_results, list(read.table(file, header=TRUE, sep=",", stringsAsFactors=FALSE)))
  }
  
  for (country in country_list) {
    harvestArea_country <- harvestArea[harvestArea$Country == country,]
    if (nrow(harvestArea_country) == 0) {next}
    else{
      for (i in 1:length(sim_results)) {
        sim_results_model <- sim_results[[i]]
        for (year in year_range) {
          yield_ha_country <- merge(harvestArea_country, sim_results_model[sim_results_model$SowingYear == year, ], by = c("long", "lat"))
          out_onerow <- as.data.frame(t(c(species, country, gcm[i], year, sum(yield_ha_country$Area_sum * yield_ha_country$Yield) / sum(yield_ha_country$Area_sum), 
                                          sum(yield_ha_country$Area_sum * yield_ha_country$Biomass) / sum(yield_ha_country$Area_sum), 
                                          sum(yield_ha_country$Area_sum * yield_ha_country$Duration) / sum(yield_ha_country$Area_sum),
                                          sum(yield_ha_country$Area_sum))))
          colnames(out_onerow) <- c("crop","country","gcm","year","yield","biomass","duration","harvest_area")
          out_summary <- rbind(out_summary, out_onerow)
          
          #production
          out_onerow_prod <- as.data.frame(t(c(species, country, gcm[i], year, sum(yield_ha_country$Area_sum * yield_ha_country$Yield), 
                                               sum(yield_ha_country$Area_sum * yield_ha_country$Biomass), 
                                               sum(yield_ha_country$Area_sum * yield_ha_country$Duration))))
          colnames(out_onerow_prod) <- c("crop","country","gcm","year","production","biomass","duration")
          out_summary_prod <- rbind(out_summary_prod, out_onerow_prod)
        }
      }
    }
  }
}
out_summary <- out_summary[-1,]
# write.csv(out_summary, file = "./Simple_output_discover/23crops_country.csv", row.names = FALSE)
# Read data from a CSV file into a dataframe
# out_summary <- read.csv(file = "./Simple_output_discover/23crops_country.csv", header = TRUE, sep = ",", stringsAsFactors = FALSE)
out_summary$yield <- as.numeric(out_summary$yield)
out_summary$yield[is.nan(out_summary$yield)] <- NA
out_summary$biomass <- as.numeric(out_summary$biomass)
out_summary$biomass[is.nan(out_summary$biomass)] <- NA
out_summary$duration <- as.numeric(out_summary$duration)
out_summary$duration[is.nan(out_summary$duration)] <- NA
out_summary$harvest_area <- as.numeric(out_summary$harvest_area)
out_summary$harvest_area[is.nan(out_summary$harvest_area)] <- NA

## plot
df_violin <- data.frame(violin_value = as.numeric(NA), violin_crops = as.character(NA), violin_source = as.character(NA))
for (i in 1:((dim(df_fao_sim)[2]-1)/2)) {
  crop_name <- substr(column_names[2*i], 1, nchar(column_names[2*i]) - 4)
  
  ## fao
  data_fao <- as.data.frame(read_csv(paste0(dir_fao, gsub(" ", "", crop_name),'.csv'), show_col_types = FALSE)) # gsub removes spaces
  data_fao_area <- as.data.frame(read_csv(paste0(dir_fao_area, gsub(" ", "", crop_name),'.csv'), show_col_types = FALSE)) # gsub removes spaces
#  df_long <- df_fao_sim[, c(1, 2*i, 2*i+1)] %>% pivot_longer(cols = -Year, names_to = "crop", values_to = "value")
  data_fao$Area[data_fao$Area == "Ethiopia PDR"] <- "Ethiopia"
  
  pivot_table_fao <- data_fao %>%
    group_by(Area, Year) %>%
    summarise(Average = mean(Value), .groups = 'drop') %>%
    pivot_wider(names_from = Area, values_from = Average)
  
  pivot_table_fao_area <- data_fao_area %>%
    group_by(Area, Year) %>%
    summarise(Average = mean(Value), .groups = 'drop') %>%
    pivot_wider(names_from = Area, values_from = Average)
  
  pivot_table_fao_baseline <- pivot_table_fao[pivot_table_fao$Year>=2000 & pivot_table_fao$Year<=2018,-1]/10
  pivot_table_fao_area_baseline <- pivot_table_fao_area[pivot_table_fao_area$Year>=2000 & pivot_table_fao_area$Year<=2018,-1]
  pivot_table_fao_area_baseline <- pivot_table_fao_area_baseline[, names(pivot_table_fao_area_baseline) %in% names(pivot_table_fao_baseline)]
  
  ## sim
  out_summary_crop <- out_summary[out_summary$crop == crop_name_link[crop_name] & out_summary$year>=2000 & out_summary$year<=2018,]
  # country_list <- unique(out_summary_crop$country)
  
  pivot_table_sim <- out_summary_crop %>%
    group_by(country, year) %>%
    summarise(Average = mean(yield), .groups = 'drop') %>%
    pivot_wider(names_from = country, values_from = Average)
  
  pivot_table_sim_area <- out_summary_crop %>%
    group_by(country, year) %>%
    summarise(Average = mean(harvest_area), .groups = 'drop') %>%
    pivot_wider(names_from = country, values_from = Average)
  
  pivot_table_sim <- pivot_table_sim[,-1]
  pivot_table_sim_area <- pivot_table_sim_area[,-1]
  
  ## calculate mean, median, max, min, std
  fao_min_country <- apply(pivot_table_fao_baseline, 1, min, na.rm = TRUE)
  fao_max_country <- apply(pivot_table_fao_baseline, 1, max, na.rm = TRUE)
  fao_median_country <- apply(pivot_table_fao_baseline, 1, median, na.rm = TRUE)
  fao_std_country <- apply(pivot_table_fao_baseline, 1, sd, na.rm = TRUE)
  fao_mean_country <- apply(pivot_table_fao_baseline * pivot_table_fao_area_baseline, 1, sum, na.rm = TRUE) / apply(pivot_table_fao_area_baseline, 1, sum, na.rm = TRUE)
  
  sim_min_country <- apply(pivot_table_sim, 1, min, na.rm = TRUE)
  sim_max_country <- apply(pivot_table_sim, 1, max, na.rm = TRUE)
  sim_median_country <- apply(pivot_table_sim, 1, median, na.rm = TRUE)
  sim_std_country <- apply(pivot_table_sim, 1, sd, na.rm = TRUE)
  sim_mean_country <- apply(pivot_table_sim * pivot_table_sim_area, 1, sum, na.rm = TRUE) / apply(pivot_table_sim_area, 1, sum, na.rm = TRUE)
  
  if (crop_name == 'Tomato') {
    fao_min_country <- fao_min_country * 0.15
    fao_max_country <- fao_max_country * 0.15
    fao_median_country <- fao_median_country * 0.15
    fao_mean_country <- fao_mean_country * 0.15
    fao_std_country <- fao_std_country * 0.15
    }
  
  Mean_fao = fao_mean_country
  Median_fao = fao_median_country
  Min_fao = Mean_fao - fao_std_country #fao_min_country/10,
  Max_fao = Mean_fao + fao_std_country #fao_max_country/10,
  
  Mean_sim = sim_mean_country
  Median_sim = sim_median_country
  Min_sim = Mean_sim - sim_std_country #as.numeric(trimws(sim_min_country)),
  Max_sim = Mean_sim + sim_std_country #as.numeric(trimws(sim_max_country)),
  
  data_plot <- data.frame(
    Year = year_range,
    
    Mean_fao = Mean_fao,
    Median_fao = Median_fao,
    Min_fao = Min_fao, #fao_min_country/10,
    Max_fao = Max_fao, #fao_max_country/10,
    
    Mean_sim = Mean_sim,
    Median_sim = Median_sim,
    Min_sim = Min_sim, #as.numeric(trimws(sim_min_country)),
    Max_sim = Max_sim #as.numeric(trimws(sim_max_country)),
  )
  
  # Set up the plot with ggplot
  p1 <- ggplot(data_plot, aes(x = Year)) +
    geom_ribbon(aes(ymin = Min_fao, ymax = Max_fao), fill = "skyblue", alpha = 0.5, show.legend = FALSE) +  # Shaded area between Min and Max
    geom_ribbon(aes(ymin = Min_sim, ymax = Max_sim), fill = "darkorange", alpha = 0.5, show.legend = FALSE) +  # Shaded area between Min and Max
    geom_line(aes(y = Mean_fao, color = "FAO Mean", linetype = "FAO Mean"), size = 1.5) +
    geom_line(aes(y = Mean_sim, color = "SIM Mean", linetype = "SIM Mean"), size = 1.5) +
    geom_line(aes(y = Median_fao, color = "FAO Median", linetype = "FAO Median"), size = 1.5) +
    geom_line(aes(y = Median_sim, color = "SIM Median", linetype = "SIM Median"), size = 1.5) +
    scale_x_continuous(breaks = seq(min(data_plot$Year, na.rm = TRUE), max(data_plot$Year, na.rm = TRUE), by = 2)) +
    labs(title = substr(column_names[2*i], 1, nchar(column_names[2*i]) - 4), x = "Year", y = "Yield (kg/ha)") +
    scale_color_manual(values = c("FAO Mean" = "#318AD0", "SIM Mean" = "#E66A1F", "FAO Median" = "#318AD0", "SIM Median" = "#E66A1F"), name = "") +
    scale_linetype_manual(values = c("FAO Mean" = "solid", "SIM Mean" = "solid", "FAO Median" = "dotted", "SIM Median" = "dotted"), name = "") +
    theme_classic() +
    theme(
      #legend.text = element_text(size = 8), 
      legend.key.size = unit(1.3, "cm")  # Adjust the size of the legend keys
    )
  
  p1 <- p1 + theme(legend.position = "none")
  #ggsave(paste0("/Users/meijian/Work/Columbia_postdoc/VACS/fao_sim_line_compare/",crop_name,".png"), p1, width = 7, height = 4, units = "in", dpi = 300) # single plot
  
  violin_value <- c(unlist(pivot_table_fao_baseline, use.names = FALSE), unlist(pivot_table_sim[, names(pivot_table_sim) %in% names(pivot_table_fao_baseline)], use.names = FALSE))
  violin_crops <- rep(crop_name, length(violin_value))
  violin_source <- c(rep("FAO", length(unlist(pivot_table_fao_baseline, use.names = FALSE))), rep("SIM", length(unlist(pivot_table_sim[, names(pivot_table_sim) %in% names(pivot_table_fao_baseline)], use.names = FALSE))))

  df_onecrop_violin <- data.frame(violin_value = violin_value, violin_crops = violin_crops, violin_source = violin_source)
  
  df_onecrop_violin$violin_source <- as.factor(df_onecrop_violin$violin_source)
  theme_set(theme_classic() + theme(legend.position = "top"))
  
  # Initiate a ggplot
  p2 <- ggplot(df_onecrop_violin, aes(x = violin_crops, y = violin_value, fill = violin_source)) + 
    geom_violin(aes(color = violin_source), trim = FALSE, alpha = .2, position = position_dodge(0.9), show.legend = FALSE) +
    geom_boxplot(width = 0.15, alpha = 1, position = position_dodge(0.9), color = "grey40") +
    # stat_summary(fun = mean, geom = "point", shape = 20, size = 3, color = "red", position = position_dodge(0.9)) +  # Add mean points
    labs(x = "", y = "", title = "Yield\n(kg/ha)") +
    ggtitle(crop_name) +
    # scale_color_manual(values = c("#f8766d", "#00bfc4"), name = "Yield\n(kg/ha)") +
    theme(plot.title = element_text(size = 12, color = "black", hjust = 0.5),
          panel.border = element_rect(colour = "black", fill = NA, linewidth = 0.9),
          legend.title = element_text(size = 14), 
          legend.text = element_text(size = 14),
          legend.key.size = unit(0.8, "cm"),
          axis.text.x = element_blank(),  # Hide x-axis labels
          axis.ticks.x = element_blank()) + # Remove x-axis ticks
    guides(fill = guide_legend(title = "Yield\n(kg/ha)", override.aes = list(size = 4)))
  
  assign(paste0(crop_name_link[crop_name]), p2)
  
  #ggsave(paste0("/Users/meijian/Work/Columbia_postdoc/VACS/fao_sim_violin_compare/fao_sim_violin_compare_",crop_name,".png"), p2, width = 4, height = 6, units = "in", dpi = 300) # single plot
  
  df_violin <- rbind(df_violin, df_onecrop_violin)
}

# Create an invisible plot with the same dimensions and theme settings as p2
# invisible_plot <- ggplot() + 
#   theme_void() + 
#   theme(plot.background = element_blank(), panel.grid = element_blank(), panel.border = element_blank(),
#         plot.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "cm")) +
#   labs(title = " ")
# 
# layout <- (fonio + maize + pearlmillet + sorghum) /
#   (bambaragroundnut + cowpea + pigeonpea + soybean) /
#   (okra + tomato + {plot_spacer()} + {plot_spacer()}) /
#   (cassava + sweetpotato + taro + yams) /
#   (groundnut + sesame + {plot_spacer()} + {plot_spacer()})  + 
#   plot_annotation(tag_levels = list(c('a.Cereals','','','',
#                                       'b.Legumes','','','',
#                                       'c.Vegetables','','','',
#                                       'd.Roots/Tubers','','','',
#                                       'e.Oilseeds','','','')))
# 
# combined_fig <- layout + 
#   plot_layout(guides = "collect") & 
#   theme(legend.position = "right")

layout <- '
ABCD
EFGH
IJ##
KLMN
OP##
'
combined_fig <- wrap_plots(
           A = fonio, B = maize, C = pearlmillet, D = sorghum, 
           E = bambaragroundnut, F = cowpea, G = pigeonpea, H = soybean, 
           I = okra, J = tomato,
           K = cassava, L = sweetpotato, M = taro, N = yams, 
           O = groundnut, P = sesame,
           design = layout) +
          plot_annotation(tag_levels = list(
                                      c('a. Cereals','','','',
                                      'b. Legumes','','','',
                                      'c. Vegetables','',
                                      'd. Roots/Tubers','','','',
                                      'e. Oilseeds',''))) +
          plot_layout(guides = "collect", axis_titles = "collect_y", widths = c(1, 1, 1, 1)) &
          theme(legend.position = "right", plot.tag.position = c(0.17,1.02), plot.tag = element_text(face = 'bold',size = 12))
ggsave(paste0("/Users/meijian/Work/Columbia_postdoc/VACS/fao_sim_violin_compare/fao_sim_violin_compare_all_combined.png"), combined_fig, width = 13, height = 12, units = "in", dpi = 600)

combined_fig2 <- fonio + cowpea + yams + tomato + plot_layout(guides = "collect", axis_titles = "collect_y", widths = c(1, 1))  &
  theme(legend.position = "right", plot.tag = element_text(face = 'bold',size = 12))
ggsave(paste0("/Users/meijian/Work/Columbia_postdoc/VACS/fao_sim_violin_compare/fao_sim_violin_compare_represent_crops.png"), combined_fig2, width = 7, height = 5, units = "in", dpi = 600)

# combined_fig <- (fonio | maize | pearlmillet | sorghum) / 
#   (bambaragroundnut | cowpea | pigeonpea | soybean) /
#   (okra | tomato | plot_spacer() | plot_spacer()) /
#   (cassava | sweetpotato | taro | yams) /
#   (groundnut | sesame | plot_spacer() | plot_spacer())  + 
#   plot_annotation(tag_levels = list(c('a. Cereals','','','',
#                                       'b. Legumes','','','',
#                                       'c. Vegetables','',
#                                       'd. Roots/Tubers','','','',
#                                       'e. Oilseeds',''))) +
#   plot_layout(guides = "collect", axis_titles = "collect_y", widths = c(1, 1, 1, 1)) &
#   theme(legend.position = "right", plot.tag = element_text(face = 'bold',size = 12))
# ggsave(paste0("/Users/meijian/Work/Columbia_postdoc/VACS/fao_sim_violin_compare/fao_sim_violin_compare_all_combined.png"), combined_fig, width = 13, height = 12, units = "in", dpi = 600)

df_violin <- df_violin[-1,]

##box/violin plot
df_violin$violin_source <- as.factor(df_violin$violin_source)
theme_set(theme_classic() + theme(legend.position = "top"))
# Initiate a ggplot
p <- ggplot(df_violin, aes(x = violin_crops, y = violin_value)) + 
  geom_violin(aes(color = violin_source), trim = FALSE, position = position_dodge(0.9) ) +
  geom_boxplot(aes(color = violin_source), width = 0.15, position = position_dodge(0.9)) +
  labs(x = "", y = "Yield (kg/ha)") +
  scale_color_manual(values = c("skyblue", "darkorange"), name = "") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
  
ggsave(paste0("/Users/meijian/Work/Columbia_postdoc/VACS/fao_sim_violin_compare/fao_sim_violin_compare_all.png"), p, width = 12, height = 6, units = "in", dpi = 300) # single plot
